import pandas as pd
dataset_root = './PanJiu/data/'

# Corpus data, used for unsupervised embedding, vocabulary building, etc.
# Load the log dataset and, for every serial number (`sn`), merge the `msg`
# values of its last 10 rows (ordered by `time`) into one '.'-joined string.
sel_data = pd.read_csv(dataset_root + "preliminary_sel_log_dataset.csv")
sel_data.sort_values(by=['sn', 'time'], inplace=True)
sel_data.reset_index(drop=True, inplace=True)
sn_list = sel_data['sn'].drop_duplicates(keep='first').to_list()

# Aggregate in a single groupby pass instead of re-filtering the whole frame
# once per `sn` (which scaled as rows x distinct-sn and made this step slow).
# groupby keeps the row order inside each group, so tail(10) still picks the
# 10 latest rows of the time-sorted frame.
tail_msg_by_sn = (
    sel_data.groupby('sn', sort=False)['msg']
    .agg(lambda msgs: '.'.join(msgs.tail(10).to_list()))
    .to_dict()
)
# Look the segments up through sn_list so both lists stay aligned; a missing
# (NaN) `sn` is dropped by groupby and maps to '' exactly as the previous
# equality filter (which never matches NaN) did.
tail_msg_list = [tail_msg_by_sn.get(sn, '') for sn in sn_list]

### Example: the msg fields of one sn merged into a single large text segment
# ' Processor CPU1_Status | IERR | Asserted. Processor CPU0_Status | IERR | Asserted. Management Subsys Health System_Health | Sensor access degraded or unavailable | Asserted. Processor CPU0_Status | IERR | Deasserted. Processor CPU1_Status | IERR | Deasserted. System ACPI Power State ACPI_PWR_Status | S4/S5: soft-off | Asserted. System ACPI Power State ACPI_PWR_Status | S0/G0: working | Asserted. Management Subsys Health System_Health | Sensor access degraded or unavailable | Deasserted. System Boot Initiated BIOS_Boot_Up | Initiated by warm reset | Asserted'

### Persist the preprocessed segments so they do not have to be rebuilt:
### one merged segment per line, written as UTF-8 to corpus.txt.
with open(dataset_root + "corpus.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(segment + "\n" for segment in tail_msg_list)